import urllib.parse

import scrapy
import time
import os
import random
import threading

from Knowledge_Planet.untils import download_fujian
from urllib.parse import urlencode
from Knowledge_Planet.items import InitialPageItem
from datetime import datetime
import re

class GetDataSpider(scrapy.Spider):
    """Crawl topics from a ZSXQ group feed, page by page.

    For every topic the spider downloads attached files, the author's avatar
    and the content images into local folders, rewrites the inline
    ``<e type=... />`` markers into plain HTML, and yields an
    ``InitialPageItem``.  Pagination is driven by the last ``end_time``
    recorded in ``topic_id.txt`` (presumably written by the item pipeline —
    TODO confirm); crawling stops when the API returns no further topics.
    """

    name = 'get_data'

    # allowed_domains = ['https://wx.zsxq.com']
    # start_urls = ['https://wx.zsxq.com/']

    # Feed endpoint of the target group; query parameters are appended.
    API_URL = "https://api.zsxq.com/v2/groups/518841428244/topics?"

    # Pre-compiled patterns for the inline <e .../> markers embedded in topic
    # and comment text.  Raw strings avoid invalid-escape warnings that the
    # original plain strings ('[\s\S]') produced.
    _WEB_HREF_RE = re.compile(r'<e type="web" href="(.*?)"[\s\S]*?/>')
    _WEB_TITLE_RE = re.compile(r'<e type="web" href="[\s\S]*?" title="(.*?)" />')
    _WEB_TAG_RE = re.compile(r'<e type="web" href=[\s\S]*? />')
    _MENTION_TITLE_RE = re.compile(r'<e type="mention" uid="[\s\S]*?" title="(.*?)"[\s\S]*?/>')
    _MENTION_TAG_RE = re.compile(r'<e type="mention"[\s\S]*?/>')
    _ANY_TITLE_RE = re.compile(r'<e type=[\s\S]*?title="(.*?)"[\s\S]*?/>')
    _HASHTAG_TAG_RE = re.compile(r'<e type="hashtag" hid=[\s\S]*? />')

    def __init__(self, cookie=None, end_time=None, *args, **kwargs):
        """Store the session cookie, resume point and local download paths.

        :param cookie: cookie dict/string forwarded with every API request.
        :param end_time: optional resume timestamp for the first request.
        """
        # Bug fix: the original skipped Spider.__init__, which Scrapy requires
        # for proper spider initialisation.
        super().__init__(*args, **kwargs)
        self.status_cookie = cookie
        self.end_time = end_time
        self.item_lock = threading.Lock()
        # Download targets live next to the package directory.
        base_dir = os.path.dirname(os.path.dirname(__file__))
        self.file_path = os.path.join(base_dir, 'OtherFiles')
        self.images_path = os.path.join(base_dir, 'images')
        self.head_path = os.path.join(base_dir, 'HeadImages')
        # Public base URL under which the downloaded assets are re-served.
        self.from_website = "http://dahanghai6.fuyuzhe.com.cn/"
        self.headers = {
            'referer': 'https://wx.zsxq.com/',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }

    def start_requests(self):
        """Issue the first feed request, resuming from ``end_time`` if given."""
        os.makedirs(self.file_path, exist_ok=True)
        params = {
            'scope': 'all',
            'count': '20',
        }
        # Bug fix: the original only compared against '' and therefore sent
        # the literal string "None" when end_time was left at its default.
        if self.end_time:
            params['end_time'] = self.end_time
        yield scrapy.Request(url=self.API_URL + urlencode(params),
                             cookies=self.status_cookie, callback=self.parse_data,
                             dont_filter=True)

    def _rewrite_web_links(self, text):
        """Replace each ``<e type="web" .../>`` marker with an ``<a>`` anchor."""
        hrefs = [urllib.parse.unquote(h) for h in self._WEB_HREF_RE.findall(text)]
        titles = [urllib.parse.unquote(t) for t in self._WEB_TITLE_RE.findall(text)]
        for url, title in zip(hrefs, titles):
            # Bug fix: the original emitted '<a href="..." />title</a>'.
            anchor = '<a href="' + url + '">' + title + '</a>'
            # Replace one marker per pass so every link keeps its own title.
            text = self._WEB_TAG_RE.sub(anchor, urllib.parse.unquote(text), 1)
        return text

    def _extract_hashtags(self, text):
        """Return the '#'-stripped titles of every inline tag marker."""
        return [urllib.parse.unquote(tag).replace('#', '')
                for tag in self._ANY_TITLE_RE.findall(text)]

    def _clean_comment_text(self, text):
        """Inline mention and web-link markers inside one comment's text."""
        for mention in self._MENTION_TITLE_RE.findall(text):
            text = self._MENTION_TAG_RE.sub(mention, urllib.parse.unquote(text))
        titles = self._WEB_TITLE_RE.findall(text)
        hrefs = self._WEB_HREF_RE.findall(text)
        # Bug fix: the original unpacked zip(titles, hrefs) as (url, title),
        # swapping the anchor's href and its label.
        for title, url in zip(titles, hrefs):
            anchor = urllib.parse.unquote('<a href="' + url + '">' + title + '</a>')
            text = self._WEB_TAG_RE.sub(anchor, urllib.parse.unquote(text), 1)
        return text.strip("\n")

    def _download_files(self, talk):
        """Download every attachment of *talk*; return their public URLs."""
        file_urls = []
        for file_dic in talk.get('files', []):
            file_name = file_dic['name']
            download_fujian.get_fujian(file_dic['file_id'],
                                       self.file_path + "/" + file_name,
                                       self.status_cookie)
            file_urls.append(self.from_website + 'OtherFiles' + '/' + file_name)
            # Random pause between downloads to avoid rate limiting.
            time.sleep(random.randint(1, 3))
        return file_urls

    def _download_avatar(self, owner):
        """Download the author's avatar; return its public URL."""
        head_file = str(owner['user_id']) + '.png'
        download_fujian.get_headimage(owner['avatar_url'],
                                      self.head_path + '/' + head_file)
        time.sleep(random.randint(1, 3))
        return self.from_website + 'HeadImages' + '/' + head_file

    def _download_images(self, topic_id, img_list):
        """Download the topic's content images; return their public URLs."""
        image_urls = []
        for index, img_dic in enumerate(img_list):
            img_file = str(topic_id) + "_" + str(index) + "." + img_dic['type']
            # Prefer the original-resolution asset when the API provides it.
            if 'original' in img_dic:
                img_url = img_dic['original']['url']
            else:
                img_url = img_dic['large']['url']
            download_fujian.get_images(img_url, self.images_path + '/' + img_file)
            image_urls.append(self.from_website + 'images' + '/' + img_file)
            time.sleep(random.randint(1, 3))
        return image_urls

    def parse_data(self, response):
        """Parse one feed page: yield one item per topic, then queue the next page."""
        jsonItem = response.json()['resp_data']
        if not jsonItem:
            return
        topics = jsonItem.get('topics', [])
        if len(topics) <= 1:
            print('抓取结束！！！')
            return
        for resItem in topics:
            create_time = resItem.get("create_time", "")
            topic_id = resItem.get("topic_id", "")
            group_info = resItem.get("group", {})
            content_type = resItem.get("type", "")
            # Counters/metadata do not depend on the "talk" payload; the
            # original read them inside the talk branch only, which raised
            # NameError (or reused stale values) for non-talk topics.
            latest_likes = resItem.get("latest_likes", [])
            likes_count = resItem.get("likes_count", 0)
            show_comments = resItem.get("show_comments", [])
            rewards_count = resItem.get("rewards_count", 0)
            comments_count = resItem.get("comments_count", 0)
            reading_count = resItem.get("reading_count", 0)
            readers_count = resItem.get("readers_count", 0)
            digested = resItem.get("digested", 0)
            sticky = resItem.get("sticky", 0)
            user_specific = resItem.get("user_specific", {})
            className = resItem.get("className", "A")
            # Defaults for topics that carry no "talk" payload.
            talk_text = ""
            talk_owner = {}
            hashtags = []
            uid = ""
            talk_images = []
            talk_files = []
            user_headimg = ''

            talk = resItem.get("talk")
            if talk:
                talk_owner = talk["owner"]
                uid = talk_owner["user_id"]
                # Turn the <e type="web"/> markers into real anchors, collect
                # the hashtags, then strip the hashtag markers and convert
                # newlines for HTML display.
                talk_text = self._rewrite_web_links(talk["text"])
                hashtags = self._extract_hashtags(talk_text)
                talk_text = self._HASHTAG_TAG_RE.sub('', urllib.parse.unquote(talk_text))
                talk_text = urllib.parse.unquote(talk_text).replace('\n', '<br>')
                # Rewrite markers inside the comments in place (the item
                # carries show_comments, so in-place mutation is what counts).
                for comment in show_comments:
                    comment['text'] = self._clean_comment_text(comment['text'])
                talk_files = self._download_files(talk)
                if 'owner' in talk:
                    user_headimg = self._download_avatar(talk['owner'])
                if 'images' in talk:
                    talk_images = self._download_images(resItem['topic_id'], talk['images'])

            yield InitialPageItem(
                create_time=create_time,
                topic_id=topic_id,
                group_info=group_info,
                content_type=content_type,
                talk_owner=talk_owner,
                talk_text=talk_text,
                talk_images=talk_images,
                latest_likes=latest_likes,
                show_comments=show_comments,
                likes_count=likes_count,
                rewards_count=rewards_count,
                comments_count=comments_count,
                digested=digested,
                sticky=sticky,
                user_specific=user_specific,
                hashtags=hashtags,
                uid=uid,
                className=className,
                reading_count=reading_count,
                readers_count=readers_count,
                talk_files=talk_files,
                user_headimg=user_headimg
            )

        # Continue crawling from the oldest timestamp recorded by the
        # pipeline; stop gracefully instead of crashing if the file is
        # missing or empty.
        try:
            with open('topic_id.txt', 'r', encoding='utf-8') as f1:
                end_time = f1.readlines()[-1].strip().split('|')[-1]
        except (FileNotFoundError, IndexError):
            print('topic_id.txt unavailable, stopping pagination')
            return
        params = {
            'scope': 'all',
            'count': '20',
            'end_time': end_time,
        }
        print("采集时间--%s" % end_time)
        yield scrapy.Request(
            url=self.API_URL + urlencode(params),
            cookies=self.status_cookie, callback=self.parse_data, dont_filter=True)

